This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see https://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Load the obesity dataset and derive BMI = weight (kg) / height (m)^2,
# rounded to two decimals; preview the first ten rows.
data <- read.csv("obesity.csv")
data$BMI <- round(data$Weight / data$Height^2, 2)
head(data, 10)
## Age Gender Height Weight CALC FAVC FCVC NCP SCC SMOKE CH2O
## 1 21 Female 1.62 64.0 no no 2 3 no no 2
## 2 21 Female 1.52 56.0 Sometimes no 3 3 yes yes 3
## 3 23 Male 1.80 77.0 Frequently no 2 3 no no 2
## 4 27 Male 1.80 87.0 Frequently no 3 3 no no 2
## 5 22 Male 1.78 89.8 Sometimes no 2 1 no no 2
## 6 29 Male 1.62 53.0 Sometimes yes 2 3 no no 2
## 7 23 Female 1.50 55.0 Sometimes yes 3 3 no no 2
## 8 22 Male 1.64 53.0 Sometimes no 2 3 no no 2
## 9 24 Male 1.78 64.0 Frequently yes 3 3 no no 2
## 10 22 Male 1.72 68.0 no yes 2 3 no no 2
## family_history_with_overweight FAF TUE CAEC MTRANS
## 1 yes 0 1 Sometimes Public_Transportation
## 2 yes 3 0 Sometimes Public_Transportation
## 3 yes 2 1 Sometimes Public_Transportation
## 4 no 2 0 Sometimes Walking
## 5 no 0 0 Sometimes Public_Transportation
## 6 no 0 0 Sometimes Automobile
## 7 yes 1 0 Sometimes Motorbike
## 8 no 3 0 Sometimes Public_Transportation
## 9 yes 1 1 Sometimes Public_Transportation
## 10 yes 1 1 Sometimes Public_Transportation
## NObeyesdad BMI
## 1 Normal_Weight 24.39
## 2 Normal_Weight 24.24
## 3 Normal_Weight 23.77
## 4 Overweight_Level_I 26.85
## 5 Overweight_Level_II 28.34
## 6 Normal_Weight 20.20
## 7 Normal_Weight 24.44
## 8 Normal_Weight 19.71
## 9 Normal_Weight 20.20
## 10 Normal_Weight 22.99
# Inspect column types and a preview of each variable.
str(data)
## 'data.frame': 2111 obs. of 18 variables:
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight : num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ CALC : chr "no" "Sometimes" "Frequently" "Frequently" ...
## $ FAVC : chr "no" "no" "no" "no" ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ SCC : chr "no" "yes" "no" "no" ...
## $ SMOKE : chr "no" "yes" "no" "no" ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ family_history_with_overweight: chr "yes" "yes" "yes" "no" ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CAEC : chr "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
## $ MTRANS : chr "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
## $ NObeyesdad : chr "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
## $ BMI : num 24.4 24.2 23.8 26.9 28.3 ...
# List the distinct obesity-level labels present in the target column.
unique(data$NObeyesdad)
## [1] "Normal_Weight" "Overweight_Level_I" "Overweight_Level_II"
## [4] "Obesity_Type_I" "Insufficient_Weight" "Obesity_Type_II"
## [7] "Obesity_Type_III"
# Keep only rows labelled Overweight_Level_I and report how many there are.
is_ow1 <- data$NObeyesdad == "Overweight_Level_I"
overweight_data <- data[is_ow1, ]
print(nrow(overweight_data))
## [1] 290
# Colour palette for the seven obesity categories.
pie_colors <- c("lightblue", "lightgreen", "orange", "brown", "lavender", "lightyellow", "lightpink")
# Tabulate the target variable and attach a percentage column.
target_count <- as.data.frame(table(data$NObeyesdad))
colnames(target_count) <- c("NObeyesdad", "Count")
target_count$Percentage <- 100 * target_count$Count / sum(target_count$Count)
# Pie chart: a single stacked bar transformed to polar coordinates,
# with percentage labels centred in each slice.
ggplot(target_count, aes(x = "", y = Count, fill = NObeyesdad)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  scale_fill_manual(values = pie_colors) +
  labs(title = "Distribution of Obesity Levels") +
  theme_void() +
  theme(legend.position = "right") +
  guides(fill = guide_legend(title = "Obesity Levels")) +
  geom_text(aes(label = sprintf("%.1f%%", Percentage)),
            position = position_stack(vjust = 0.5),
            size = 4)
# Bar chart of category counts for the target variable.
# Fix: the original title said "Histogram on Categorical Variable" — a bar
# chart of a categorical column is not a histogram.
g <- ggplot(data, aes(x = NObeyesdad))
g + geom_bar(aes(fill = NObeyesdad), width = 0.5) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(title = "Bar Chart on Categorical Variable",
       subtitle = "Distribution of Different Obesity Types",
       x = "Obesity Type", y = "Count")
# NOTE(review): theme_set() only affects plots drawn AFTER this point (not the
# chart above), and it is overridden by theme_set(theme_bw()) further down
# before the next plot — so this call currently has no visible effect.
theme_set(theme_classic())
# Rows belonging to any of the three obesity classes.
obesity_labels <- c("Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III")
obesity_type <- data[data$NObeyesdad %in% obesity_labels, ]
head(obesity_type)
## Age Gender Height Weight CALC FAVC FCVC NCP SCC SMOKE CH2O
## 11 26 Male 1.85 105 Sometimes yes 3 3 no no 3
## 14 41 Male 1.80 99 Frequently yes 2 3 no no 2
## 18 29 Female 1.53 78 no yes 2 1 no no 2
## 22 52 Female 1.69 87 no yes 3 1 no yes 2
## 24 22 Female 1.60 82 Sometimes yes 1 1 no no 2
## 68 23 Male 1.65 95 Frequently yes 2 3 no no 2
## family_history_with_overweight FAF TUE CAEC MTRANS
## 11 yes 2 2 Frequently Public_Transportation
## 14 no 2 1 Sometimes Automobile
## 18 no 0 0 Sometimes Automobile
## 22 yes 0 0 Sometimes Automobile
## 24 yes 0 2 Sometimes Public_Transportation
## 68 yes 0 1 Always Automobile
## NObeyesdad BMI
## 11 Obesity_Type_I 30.68
## 14 Obesity_Type_I 30.56
## 18 Obesity_Type_I 33.32
## 22 Obesity_Type_I 30.46
## 24 Obesity_Type_I 32.03
## 68 Obesity_Type_I 34.89
# Rows belonging to the two overweight classes.
overweight_labels <- c("Overweight_Level_I", "Overweight_Level_II")
overweight_level <- data[data$NObeyesdad %in% overweight_labels, ]
head(overweight_level)
## Age Gender Height Weight CALC FAVC FCVC NCP SCC SMOKE CH2O
## 4 27 Male 1.80 87.0 Frequently no 3 3 no no 2
## 5 22 Male 1.78 89.8 Sometimes no 2 1 no no 2
## 12 21 Female 1.72 80.0 Sometimes yes 2 3 yes no 2
## 17 27 Male 1.93 102.0 Sometimes yes 2 1 no no 1
## 19 30 Female 1.71 82.0 no yes 3 4 no yes 1
## 20 23 Female 1.65 70.0 Sometimes no 2 1 no no 2
## family_history_with_overweight FAF TUE CAEC MTRANS
## 4 no 2 0 Sometimes Walking
## 5 no 0 0 Sometimes Public_Transportation
## 12 yes 2 1 Frequently Public_Transportation
## 17 yes 1 0 Sometimes Public_Transportation
## 19 yes 0 0 Frequently Automobile
## 20 yes 0 0 Sometimes Public_Transportation
## NObeyesdad BMI
## 4 Overweight_Level_I 26.85
## 5 Overweight_Level_II 28.34
## 12 Overweight_Level_II 27.04
## 17 Overweight_Level_II 27.38
## 19 Overweight_Level_II 28.04
## 20 Overweight_Level_I 25.71
# Everything that is neither obese nor overweight (normal / insufficient weight).
heavy_labels <- c("Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III",
                  "Overweight_Level_I", "Overweight_Level_II")
others <- data[!(data$NObeyesdad %in% heavy_labels), ]
head(others)
## Age Gender Height Weight CALC FAVC FCVC NCP SCC SMOKE CH2O
## 1 21 Female 1.62 64 no no 2 3 no no 2
## 2 21 Female 1.52 56 Sometimes no 3 3 yes yes 3
## 3 23 Male 1.80 77 Frequently no 2 3 no no 2
## 6 29 Male 1.62 53 Sometimes yes 2 3 no no 2
## 7 23 Female 1.50 55 Sometimes yes 3 3 no no 2
## 8 22 Male 1.64 53 Sometimes no 2 3 no no 2
## family_history_with_overweight FAF TUE CAEC MTRANS
## 1 yes 0 1 Sometimes Public_Transportation
## 2 yes 3 0 Sometimes Public_Transportation
## 3 yes 2 1 Sometimes Public_Transportation
## 6 no 0 0 Sometimes Automobile
## 7 yes 1 0 Sometimes Motorbike
## 8 no 3 0 Sometimes Public_Transportation
## NObeyesdad BMI
## 1 Normal_Weight 24.39
## 2 Normal_Weight 24.24
## 3 Normal_Weight 23.77
## 6 Normal_Weight 20.20
## 7 Normal_Weight 24.44
## 8 Normal_Weight 19.71
theme_set(theme_bw())
# Violin plot of Age by obesity level.
# Fix: the caption said "Source: mpg" — a leftover from a template example;
# the data here comes from obesity.csv.
g <- ggplot(data = data, aes(x = Age, y = NObeyesdad, fill = NObeyesdad))
g + geom_violin() +
  labs(title = "Age vs NObeyesdad",
       caption = "Source: obesity.csv",
       x = "Age",
       y = "NObeyesdad")
# Overlaid density curves of Age per obesity level, clipped to ages 14-40
# (rows outside the limits are dropped, hence the knitr warning).
ggplot(data, aes(x = Age, fill = NObeyesdad)) +
  geom_density(alpha = 0.4) +
  xlim(14, 40) +
  labs(title = "Density Plot for Age", x = "Age", y = "Density") +
  theme_minimal()
## Warning: Removed 57 rows containing non-finite outside the scale range
## (`stat_density()`).
# Faceted age histograms for each subset; the three charts differ only in the
# data frame and title, so the shared layers live in a small helper.
age_hist <- function(df, plot_title) {
  ggplot(df, aes(x = Age)) +
    geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
    facet_wrap(~NObeyesdad, scales = "free") +
    labs(title = plot_title) +
    theme(axis.text = element_text(size = 6))
}
age_hist(others, "Age Distribution for Normal Weight and Insufficient Types")
age_hist(obesity_type, "Age Distribution for Different Obesity Types")
age_hist(overweight_level, "Age Distribution for Different Overweight Types")
# Violin plot of Weight by obesity level.
# Fix: the caption said "Source: mpg" — template leftover; data is obesity.csv.
g <- ggplot(data = data, aes(x = Weight, y = NObeyesdad, fill = NObeyesdad))
g + geom_violin() +
  labs(title = "Weight vs NObeyesdad",
       caption = "Source: obesity.csv",
       x = "Weight",
       y = "NObeyesdad")
# Density of Weight per obesity level, restricted to the 30-170 kg range.
ggplot(data, aes(x = Weight, fill = NObeyesdad)) +
  geom_density(alpha = 0.4) +
  xlim(30, 170) +
  labs(title = "Density Plot for Weight", x = "Weight", y = "Density") +
  theme_minimal()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).
# Faceted weight histograms for each subset, sharing layers via a helper.
weight_hist <- function(df, plot_title) {
  ggplot(df, aes(x = Weight)) +
    geom_histogram(binwidth = 1, fill = "lavender", color = "black") +
    facet_wrap(~NObeyesdad, scales = "free") +
    labs(title = plot_title) +
    theme(axis.text = element_text(size = 6))
}
weight_hist(others, "Weight Distribution for Normal Weight and Insufficient Types")
weight_hist(obesity_type, "Weight Distribution for Different Obesity Types")
weight_hist(overweight_level, "Weight Distribution for Different Overweight Types")
# Violin plot of Height by obesity level.
# Fix: the caption said "Source: mpg" — template leftover; data is obesity.csv.
g <- ggplot(data = data, aes(x = Height, y = NObeyesdad, fill = NObeyesdad))
g + geom_violin() +
  labs(title = "Height vs NObeyesdad",
       caption = "Source: obesity.csv",
       x = "Height",
       y = "NObeyesdad")
# Density of Height per obesity level over the plausible 1.4-2.2 m range.
ggplot(data, aes(x = Height, fill = NObeyesdad)) +
  geom_density(alpha = 0.4) +
  xlim(1.4, 2.2) +
  labs(title = "Density Plot for Height", x = "Height", y = "Density") +
  theme_minimal()
# Faceted height histograms for each subset.
# Fixes: (1) Height is in metres (~1.4-2.0), so the original binwidth = 1
# collapsed all observations into a single bar — 0.05 m bins show the actual
# distribution; (2) the last title said "Age Distribution" for a Height plot.
height_hist <- function(df, plot_title) {
  ggplot(df, aes(x = Height)) +
    geom_histogram(binwidth = 0.05, fill = "lightgray", color = "black") +
    facet_wrap(~NObeyesdad, scales = "free") +
    labs(title = plot_title) +
    theme(axis.text = element_text(size = 6))
}
height_hist(others, "Height Distribution for Normal Weight and Insufficient Types")
height_hist(obesity_type, "Height Distribution for Different Obesity Types")
height_hist(overweight_level, "Height Distribution for Different Overweight Types")
# Density of the derived BMI per obesity level on the 0-60 range
# (the original wrote the lower limit as -0, which equals 0).
ggplot(data, aes(x = BMI, fill = NObeyesdad)) +
  geom_density(alpha = 0.4) +
  xlim(0, 60) +
  labs(title = "Density Plot for BMI", x = "BMI", y = "Density") +
  theme_minimal()
# The seven grouped bar charts below differed only in the fill variable, so the
# repetition is factored into one helper. `.data[[...]]` selects the column by
# its string name inside aes(); `fill = fill_var` keeps the legend title equal
# to the column name, matching the originals. The intermediate `g` from the
# original code was never used after this chunk, so it is not recreated.
plot_counts_by <- function(fill_var) {
  ggplot(data, aes(x = NObeyesdad)) +
    geom_bar(aes(fill = .data[[fill_var]]), width = 0.5, position = "dodge") +
    theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
    labs(title = paste("Distribution of Different Obesity Types by", fill_var),
         x = "Obesity Type", y = "Count", fill = fill_var)
}
# Explicit print() is required inside a loop for ggplot objects to render.
for (v in c("Gender", "FAVC", "CALC", "SCC", "SMOKE", "CAEC", "MTRANS")) {
  print(plot_counts_by(v))
}
# Scatterplots of obesity level against each behavioural frequency variable.
# Fix: the last three titles all said "FCVC" in the original (copy-paste
# errors); they now name the variable actually plotted (CH2O, FAF, TUE).
ggplot(data, aes(x = FCVC, y = NObeyesdad)) +
  geom_point() +
  labs(title = "Scatterplot of Obesity Levels vs. FCVC",
       x = "Frequency of Vegetables Taken",
       y = "Obesity Levels") +
  theme_minimal()
ggplot(data, aes(x = CH2O, y = NObeyesdad)) +
  geom_point() +
  labs(title = "Scatterplot of Obesity Levels vs. CH2O",
       x = "Frequency of Water Consumed",
       y = "Obesity Levels") +
  theme_minimal()
ggplot(data, aes(x = FAF, y = NObeyesdad)) +
  geom_point() +
  labs(title = "Scatterplot of Obesity Levels vs. FAF",
       x = "Frequency of Physical Activity Done",
       y = "Obesity Levels") +
  theme_minimal()
ggplot(data, aes(x = TUE, y = NObeyesdad)) +
  geom_point() +
  labs(title = "Scatterplot of Obesity Levels vs. TUE",
       x = "Frequency of Usage of Technological Devices",
       y = "Obesity Levels") +
  theme_minimal()
# Convert every character column to a factor whose level order follows the
# order of first appearance (levels = unique(col)), matching the original
# per-column calls.
cat_cols <- c("Gender", "CALC", "FAVC", "SCC", "SMOKE",
              "family_history_with_overweight", "CAEC", "MTRANS", "NObeyesdad")
data[cat_cols] <- lapply(data[cat_cols],
                         function(col) factor(col, levels = unique(col)))
# Check the structure of the data frame
str(data)
## 'data.frame': 2111 obs. of 18 variables:
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 2 1 2 2 2 ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight : num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ CALC : Factor w/ 4 levels "no","Sometimes",..: 1 2 3 3 2 2 2 2 3 1 ...
## $ FAVC : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ SCC : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ SMOKE : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ family_history_with_overweight: Factor w/ 2 levels "yes","no": 1 1 1 2 2 2 1 2 1 1 ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CAEC : Factor w/ 4 levels "Sometimes","Frequently",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ MTRANS : Factor w/ 5 levels "Public_Transportation",..: 1 1 1 2 1 3 4 1 1 1 ...
## $ NObeyesdad : Factor w/ 7 levels "Normal_Weight",..: 1 1 1 2 3 1 1 1 1 1 ...
## $ BMI : num 24.4 24.2 23.8 26.9 28.3 ...
# Pearson correlations among all numeric columns; pairwise deletion copes with
# any missing values.
numeric_data <- Filter(is.numeric, data)
correlation_matrix <- cor(numeric_data, use = "pairwise.complete.obs")
print(correlation_matrix)
## Age Height Weight FCVC NCP CH2O
## Age 1.00000000 -0.02595813 0.20256010 0.01629089 -0.04394373 -0.04530386
## Height -0.02595813 1.00000000 0.46313612 -0.03812106 0.24367173 0.21337592
## Weight 0.20256010 0.46313612 1.00000000 0.21612471 0.10746899 0.20057539
## FCVC 0.01629089 -0.03812106 0.21612471 1.00000000 0.04221630 0.06846147
## NCP -0.04394373 0.24367173 0.10746899 0.04221630 1.00000000 0.05708800
## CH2O -0.04530386 0.21337592 0.20057539 0.06846147 0.05708800 1.00000000
## FAF -0.14493833 0.29470900 -0.05143627 0.01993940 0.12950431 0.16723649
## TUE -0.29693059 0.05191167 -0.07156136 -0.10113485 0.03632557 0.01196534
## BMI 0.24414923 0.13178799 0.93480707 0.26364870 0.03997038 0.14418710
## FAF TUE BMI
## Age -0.14493833 -0.29693059 0.24414923
## Height 0.29470900 0.05191167 0.13178799
## Weight -0.05143627 -0.07156136 0.93480707
## FCVC 0.01993940 -0.10113485 0.26364870
## NCP 0.12950431 0.03632557 0.03997038
## CH2O 0.16723649 0.01196534 0.14418710
## FAF 1.00000000 0.05856207 -0.17752763
## TUE 0.05856207 1.00000000 -0.09970563
## BMI -0.17752763 -0.09970563 1.00000000
# Melt the correlation matrix to make it suitable for ggplot2
# NOTE(review): melt() is not base R — it comes from reshape2 (or data.table),
# whose library() call is not visible in this chunk; confirm it is loaded in
# an earlier setup chunk.
melted_corr <- melt(correlation_matrix)
# Create a heatmap of the correlation matrix
# (melt() yields columns Var1, Var2, value — referenced in aes() below)
ggplot(melted_corr, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") + # Adds white borders to separate the tiles
geom_text(aes(label = sprintf("%.2f", value)), size = 3, color = "black") + # Adds correlation coefficients as labels
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0,
limits = c(-1, 1), space = "Lab",
name = "Correlation") + # Sets the color gradient
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
axis.text.y = element_text(angle = 0, vjust = 0.5),
axis.title = element_blank(), # Removes axis titles
legend.position = "bottom"
) +
labs(fill = 'Correlation Coefficient') # Legend label
With the exploratory plots complete, the next sections load the modelling libraries, perform stepwise feature selection, and fit several classifiers:
# e1071 provides naiveBayes() and svm(); gmodels provides CrossTable().
library(e1071)
library(gmodels)
# Forward stepwise subset selection.
# NOTE(review): regsubsets() comes from the `leaps` package, whose library()
# call is not visible here — confirm it is loaded in an earlier chunk.
# NOTE(review): regsubsets() fits linear models, so the factor response
# NObeyesdad is handled via numeric coding, and BMI — from which the obesity
# label is essentially derived — is among the predictors (it is selected first
# in the output below). Interpret the selection with caution.
fwd<-regsubsets(NObeyesdad~.,data=data,nvmax=16, method='forward')
summary(fwd)
## Subset selection object
## Call: regsubsets.formula(NObeyesdad ~ ., data = data, nvmax = 16, method = "forward")
## 24 Variables (and intercept)
## Forced in Forced out
## Age FALSE FALSE
## GenderMale FALSE FALSE
## Height FALSE FALSE
## Weight FALSE FALSE
## CALCSometimes FALSE FALSE
## CALCFrequently FALSE FALSE
## CALCAlways FALSE FALSE
## FAVCyes FALSE FALSE
## FCVC FALSE FALSE
## NCP FALSE FALSE
## SCCyes FALSE FALSE
## SMOKEyes FALSE FALSE
## CH2O FALSE FALSE
## family_history_with_overweightno FALSE FALSE
## FAF FALSE FALSE
## TUE FALSE FALSE
## CAECFrequently FALSE FALSE
## CAECAlways FALSE FALSE
## CAECno FALSE FALSE
## MTRANSWalking FALSE FALSE
## MTRANSAutomobile FALSE FALSE
## MTRANSMotorbike FALSE FALSE
## MTRANSBike FALSE FALSE
## BMI FALSE FALSE
## 1 subsets of each size up to 16
## Selection Algorithm: forward
## Age GenderMale Height Weight CALCSometimes CALCFrequently CALCAlways
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " " " " " "
## 7 ( 1 ) " " " " " " " " " " "*" " "
## 8 ( 1 ) " " " " " " " " " " "*" " "
## 9 ( 1 ) " " " " " " " " " " "*" " "
## 10 ( 1 ) "*" " " " " " " " " "*" " "
## 11 ( 1 ) "*" " " " " " " " " "*" " "
## 12 ( 1 ) "*" " " " " " " " " "*" " "
## 13 ( 1 ) "*" " " " " " " " " "*" " "
## 14 ( 1 ) "*" " " "*" " " " " "*" " "
## 15 ( 1 ) "*" "*" "*" " " " " "*" " "
## 16 ( 1 ) "*" "*" "*" " " " " "*" " "
## FAVCyes FCVC NCP SCCyes SMOKEyes CH2O
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " "
## 3 ( 1 ) " " "*" "*" " " " " " "
## 4 ( 1 ) " " "*" "*" " " " " " "
## 5 ( 1 ) " " "*" "*" " " " " " "
## 6 ( 1 ) " " "*" "*" " " " " " "
## 7 ( 1 ) " " "*" "*" " " " " " "
## 8 ( 1 ) "*" "*" "*" " " " " " "
## 9 ( 1 ) "*" "*" "*" " " " " " "
## 10 ( 1 ) "*" "*" "*" " " " " " "
## 11 ( 1 ) "*" "*" "*" " " " " " "
## 12 ( 1 ) "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" " " " "
## 14 ( 1 ) "*" "*" "*" "*" " " " "
## 15 ( 1 ) "*" "*" "*" "*" " " " "
## 16 ( 1 ) "*" "*" "*" "*" " " " "
## family_history_with_overweightno FAF TUE CAECFrequently CAECAlways
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " "*"
## 5 ( 1 ) " " " " " " " " "*"
## 6 ( 1 ) " " " " " " "*" "*"
## 7 ( 1 ) " " " " " " "*" "*"
## 8 ( 1 ) " " " " " " "*" "*"
## 9 ( 1 ) " " " " " " "*" "*"
## 10 ( 1 ) " " " " " " "*" "*"
## 11 ( 1 ) " " " " " " "*" "*"
## 12 ( 1 ) " " " " " " "*" "*"
## 13 ( 1 ) "*" " " " " "*" "*"
## 14 ( 1 ) "*" " " " " "*" "*"
## 15 ( 1 ) "*" " " " " "*" "*"
## 16 ( 1 ) "*" "*" " " "*" "*"
## CAECno MTRANSWalking MTRANSAutomobile MTRANSMotorbike MTRANSBike BMI
## 1 ( 1 ) " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " " " "*"
## 3 ( 1 ) " " " " " " " " " " "*"
## 4 ( 1 ) " " " " " " " " " " "*"
## 5 ( 1 ) " " "*" " " " " " " "*"
## 6 ( 1 ) " " "*" " " " " " " "*"
## 7 ( 1 ) " " "*" " " " " " " "*"
## 8 ( 1 ) " " "*" " " " " " " "*"
## 9 ( 1 ) "*" "*" " " " " " " "*"
## 10 ( 1 ) "*" "*" " " " " " " "*"
## 11 ( 1 ) "*" "*" " " "*" " " "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*"
## 14 ( 1 ) "*" "*" " " "*" " " "*"
## 15 ( 1 ) "*" "*" " " "*" " " "*"
## 16 ( 1 ) "*" "*" " " "*" " " "*"
# Backward stepwise subset selection; same regsubsets() caveats apply as for
# the forward pass (factor response, BMI-derived label among predictors).
bwd<-regsubsets(NObeyesdad~.,data=data,nvmax=16, method='backward')
summary(bwd)
## Subset selection object
## Call: regsubsets.formula(NObeyesdad ~ ., data = data, nvmax = 16, method = "backward")
## 24 Variables (and intercept)
## Forced in Forced out
## Age FALSE FALSE
## GenderMale FALSE FALSE
## Height FALSE FALSE
## Weight FALSE FALSE
## CALCSometimes FALSE FALSE
## CALCFrequently FALSE FALSE
## CALCAlways FALSE FALSE
## FAVCyes FALSE FALSE
## FCVC FALSE FALSE
## NCP FALSE FALSE
## SCCyes FALSE FALSE
## SMOKEyes FALSE FALSE
## CH2O FALSE FALSE
## family_history_with_overweightno FALSE FALSE
## FAF FALSE FALSE
## TUE FALSE FALSE
## CAECFrequently FALSE FALSE
## CAECAlways FALSE FALSE
## CAECno FALSE FALSE
## MTRANSWalking FALSE FALSE
## MTRANSAutomobile FALSE FALSE
## MTRANSMotorbike FALSE FALSE
## MTRANSBike FALSE FALSE
## BMI FALSE FALSE
## 1 subsets of each size up to 16
## Selection Algorithm: backward
## Age GenderMale Height Weight CALCSometimes CALCFrequently CALCAlways
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " "*" " " " " " " " "
## 4 ( 1 ) " " "*" "*" " " " " " " " "
## 5 ( 1 ) " " "*" "*" " " " " " " " "
## 6 ( 1 ) " " "*" "*" " " " " "*" " "
## 7 ( 1 ) " " "*" "*" " " " " "*" " "
## 8 ( 1 ) " " "*" "*" " " " " "*" " "
## 9 ( 1 ) " " "*" "*" " " " " "*" " "
## 10 ( 1 ) " " "*" "*" " " " " "*" " "
## 11 ( 1 ) " " "*" "*" " " " " "*" " "
## 12 ( 1 ) " " "*" "*" " " " " "*" " "
## 13 ( 1 ) " " "*" "*" " " " " "*" " "
## 14 ( 1 ) " " "*" "*" " " " " "*" " "
## 15 ( 1 ) " " "*" "*" " " " " "*" " "
## 16 ( 1 ) " " "*" "*" " " " " "*" " "
## FAVCyes FCVC NCP SCCyes SMOKEyes CH2O
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " "
## 5 ( 1 ) " " "*" " " " " " " " "
## 6 ( 1 ) " " "*" " " " " " " " "
## 7 ( 1 ) " " "*" " " " " " " " "
## 8 ( 1 ) " " "*" "*" " " " " " "
## 9 ( 1 ) " " "*" "*" " " " " " "
## 10 ( 1 ) " " "*" "*" "*" " " " "
## 11 ( 1 ) " " "*" "*" "*" " " " "
## 12 ( 1 ) " " "*" "*" "*" " " " "
## 13 ( 1 ) " " "*" "*" "*" " " " "
## 14 ( 1 ) " " "*" "*" "*" " " " "
## 15 ( 1 ) "*" "*" "*" "*" " " " "
## 16 ( 1 ) "*" "*" "*" "*" " " " "
## family_history_with_overweightno FAF TUE CAECFrequently CAECAlways
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " "
## 5 ( 1 ) " " " " " " "*" " "
## 6 ( 1 ) " " " " " " "*" " "
## 7 ( 1 ) " " " " " " "*" " "
## 8 ( 1 ) " " " " " " "*" " "
## 9 ( 1 ) " " " " " " "*" "*"
## 10 ( 1 ) " " " " " " "*" "*"
## 11 ( 1 ) " " "*" " " "*" "*"
## 12 ( 1 ) "*" "*" " " "*" "*"
## 13 ( 1 ) "*" "*" " " "*" "*"
## 14 ( 1 ) "*" "*" " " "*" "*"
## 15 ( 1 ) "*" "*" " " "*" "*"
## 16 ( 1 ) "*" "*" " " "*" "*"
## CAECno MTRANSWalking MTRANSAutomobile MTRANSMotorbike MTRANSBike BMI
## 1 ( 1 ) " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " " " "*"
## 3 ( 1 ) " " " " " " " " " " "*"
## 4 ( 1 ) " " " " " " " " " " "*"
## 5 ( 1 ) " " " " " " " " " " "*"
## 6 ( 1 ) " " " " " " " " " " "*"
## 7 ( 1 ) " " "*" " " " " " " "*"
## 8 ( 1 ) " " "*" " " " " " " "*"
## 9 ( 1 ) " " "*" " " " " " " "*"
## 10 ( 1 ) " " "*" " " " " " " "*"
## 11 ( 1 ) " " "*" " " " " " " "*"
## 12 ( 1 ) " " "*" " " " " " " "*"
## 13 ( 1 ) "*" "*" " " " " " " "*"
## 14 ( 1 ) "*" "*" " " "*" " " "*"
## 15 ( 1 ) "*" "*" " " "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" " " "*"
# Reproducible 80/20 train-test split: draw a group label (1 or 2) per row.
set.seed(100515)
index <- sample(2, nrow(data), prob = c(0.8, 0.2), replace = TRUE)
train_data <- data[index == 1, ]
test_data <- data[index == 2, ]
print(dim(train_data))
## [1] 1688 18
# Dimensions of the held-out test set.
print(dim(test_data))
## [1] 423 18
# Drop any rows with missing values from both splits.
train_data <- na.omit(train_data)
test_data <- na.omit(test_data)
# Multinomial logistic regression on all predictors.
# NOTE(review): multinom() is from the nnet package, but library(nnet) appears
# only further down in this file — confirm nnet is loaded in an earlier chunk
# before this line runs.
Model1 <- multinom(NObeyesdad ~., data = train_data)
## # weights: 182 (150 variable)
## initial value 3284.696332
## iter 10 value 2606.909180
## iter 20 value 1892.274122
## iter 30 value 1533.229167
## iter 40 value 1107.786517
## iter 50 value 871.147048
## iter 60 value 411.798632
## iter 70 value 176.335674
## iter 80 value 118.956333
## iter 90 value 81.502219
## iter 100 value 59.470026
## final value 59.470026
## stopped after 100 iterations
# Coefficient and standard-error tables. The NaN standard errors in the output
# below suggest unstable estimates (the optimiser also stopped at the 100-iter
# cap), so individual coefficients should not be over-interpreted.
summary(Model1)
## Warning in sqrt(diag(vc)): NaNs produced
## Call:
## multinom(formula = NObeyesdad ~ ., data = train_data)
##
## Coefficients:
## (Intercept) Age GenderMale Height Weight
## Overweight_Level_I -182.3786 0.2529941 -6.4785744 -56.18251 0.9102755
## Overweight_Level_II -211.3650 0.6120244 -7.2895805 -157.95321 2.4190094
## Obesity_Type_I -232.2732 0.5558998 -9.7320449 -372.68321 4.3410508
## Insufficient_Weight 154.8807 0.3325656 0.7217008 16.16593 -0.2509415
## Obesity_Type_II -315.5912 4.6296748 52.2773475 -708.72300 9.7179727
## Obesity_Type_III -418.1942 -0.2880451 -69.8820954 -658.32970 9.4335899
## CALCSometimes CALCFrequently CALCAlways FAVCyes
## Overweight_Level_I 1.132252 -1.083029 -22.836959 2.421359
## Overweight_Level_II -2.741616 1.365960 -54.422484 -1.063541
## Obesity_Type_I -5.087807 14.189148 -30.260675 12.922710
## Insufficient_Weight 2.280609 6.978922 -30.082807 3.237624
## Obesity_Type_II -12.841075 -32.680757 6.395896 -34.555268
## Obesity_Type_III 66.152052 45.112703 14.785060 75.860435
## FCVC NCP SCCyes SMOKEyes CH2O
## Overweight_Level_I -3.939712 -1.1948378 10.494900 -7.517610 0.5330821
## Overweight_Level_II -5.015420 -2.1645005 8.219942 -2.051327 0.4835274
## Obesity_Type_I -3.907522 -2.5717224 21.246439 13.459818 4.6597414
## Insufficient_Weight 1.108195 0.8966696 -1.712377 2.602751 1.0004423
## Obesity_Type_II -16.681017 -10.9660861 -57.148410 28.789593 -16.4304130
## Obesity_Type_III 63.173260 2.3410196 77.497622 24.089598 -2.4320917
## family_history_with_overweightno FAF TUE
## Overweight_Level_I 0.04288806 -2.808472 0.46743086
## Overweight_Level_II -2.89618701 -3.205545 2.48445960
## Obesity_Type_I -12.28770953 -1.874315 5.24067206
## Insufficient_Weight -0.11420639 1.962234 0.05586729
## Obesity_Type_II 31.45269010 -13.305927 8.59870415
## Obesity_Type_III 28.48463822 -23.632731 -18.53965520
## CAECFrequently CAECAlways CAECno MTRANSWalking
## Overweight_Level_I -4.732421 -9.646568 -6.396555 -0.04458058
## Overweight_Level_II -4.286391 -18.199771 -1.313567 -12.01643706
## Obesity_Type_I -15.312871 -20.839442 -56.786779 -15.28074955
## Insufficient_Weight 5.615335 -8.622133 9.596129 5.80202811
## Obesity_Type_II -17.182757 -7.114496 17.118648 -6.58940771
## Obesity_Type_III 104.944912 -82.673399 4.821168 -8.71275412
## MTRANSAutomobile MTRANSMotorbike MTRANSBike BMI
## Overweight_Level_I -0.8613348 -5.084285 5.43197 8.95369
## Overweight_Level_II -6.4947246 -198.931318 -243.61929 12.16491
## Obesity_Type_I -5.5876684 -34.992783 -107.16045 18.66597
## Insufficient_Weight 0.9031987 -120.331380 -192.83667 -10.24804
## Obesity_Type_II -39.3816337 -156.812556 -13.07749 21.23245
## Obesity_Type_III 19.4732124 -106.107607 82.04172 15.21788
##
## Std. Errors:
## (Intercept) Age GenderMale Height Weight
## Overweight_Level_I 5.0618044 0.1844556 2.850205 8.794632 0.1777486
## Overweight_Level_II 4.6924854 0.2164840 3.065959 8.099412 0.2312679
## Obesity_Type_I 3.0550627 0.4000755 6.508188 5.321353 0.3130731
## Insufficient_Weight 0.9235888 0.2818630 4.915184 1.607537 0.2961229
## Obesity_Type_II 2.2837843 0.7981901 8.736016 4.126346 0.4653116
## Obesity_Type_III 1.6462039 1.7703113 3.118178 2.989828 0.5904833
## CALCSometimes CALCFrequently CALCAlways FAVCyes FCVC
## Overweight_Level_I 2.131739 3.473127 NaN 1.868712 1.415400
## Overweight_Level_II 2.447281 3.740452 8.427990e-12 2.183088 1.713347
## Obesity_Type_I 3.464669 5.527589 NaN 5.644219 3.776649
## Insufficient_Weight 4.767213 3.779049 3.694472e-15 4.132707 2.904407
## Obesity_Type_II 6.726885 7.887031 NaN 7.176887 5.206766
## Obesity_Type_III 3.104725 5.088797 0.000000e+00 1.646204 1.395979
## NCP SCCyes SMOKEyes CH2O
## Overweight_Level_I 0.7936564 2.273622e+00 4.881380 1.383474
## Overweight_Level_II 0.9619816 3.587324e+00 5.707932 1.596154
## Obesity_Type_I 2.5792073 5.887990e+00 5.761969 3.323166
## Insufficient_Weight 1.7777368 5.117697e+00 5.849366 2.868054
## Obesity_Type_II 3.9290328 5.887994e+00 5.885718 4.811075
## Obesity_Type_III 4.5223343 5.083847e-05 5.088770 7.489395
## family_history_with_overweightno FAF TUE
## Overweight_Level_I 1.550134 0.7412366 1.290735
## Overweight_Level_II 2.462920 0.9002774 1.549537
## Obesity_Type_I 7.866363 3.1133119 3.351732
## Insufficient_Weight 2.710056 1.2431082 1.848191
## Obesity_Type_II 9.994182 4.3431789 4.110801
## Obesity_Type_III 3.182921 2.9775669 7.449237
## CAECFrequently CAECAlways CAECno MTRANSWalking
## Overweight_Level_I 2.029075 2.295808e+00 5.275273e+00 2.085344e+00
## Overweight_Level_II 2.870639 5.670235e+00 4.575031e+00 4.108965e+00
## Obesity_Type_I 7.427007 3.376891e+00 7.517252e+00 7.474706e+00
## Insufficient_Weight 3.034330 4.189380e+00 6.409360e-05 1.620153e+01
## Obesity_Type_II 3.469713 4.385610e+00 5.885693e+00 9.735060e-24
## Obesity_Type_III 5.150046 1.495534e-44 6.216827e-06 4.178281e-08
## MTRANSAutomobile MTRANSMotorbike MTRANSBike BMI
## Overweight_Level_I 2.644876 3.753680e+00 4.525076e+00 0.6620998
## Overweight_Level_II 3.375591 8.748933e-14 NaN 0.5434728
## Obesity_Type_I 7.977993 3.730855e+00 1.517699e-14 1.0203203
## Insufficient_Weight 4.572389 6.585211e-16 4.856742e-16 1.1946205
## Obesity_Type_II 7.929581 4.047698e-29 1.080400e-30 1.2326828
## Obesity_Type_III 5.762076 4.380917e-20 2.067739e-36 1.7119664
##
## Residual Deviance: 118.9401
## AIC: 418.9401
# Score the multinomial model on the held-out data: accuracy is the share of
# test rows whose predicted label matches the true label.
test_predictions <- predict(Model1, newdata = test_data)
test_actual_classes <- test_data$NObeyesdad
hits <- test_predictions == test_actual_classes
multinom_accuracy <- mean(hits)
print(paste("Testing Accuracy of the multinomial logistic regression model:", multinom_accuracy))
## [1] "Testing Accuracy of the multinomial logistic regression model: 0.945626477541371"
# Naive Bayes classifier (naiveBayes() from e1071, loaded earlier in the file).
naive_model <- naiveBayes(NObeyesdad ~., data = train_data)
summary(naive_model)
## Length Class Mode
## apriori 7 table numeric
## tables 17 -none- list
## levels 7 -none- character
## isnumeric 17 -none- logical
## call 4 -none- call
# Score the Naive Bayes model on the test set.
predictions <- predict(naive_model, test_data)
actual <- test_data$NObeyesdad
# Accuracy = correct predictions / total test rows.
correct <- sum(predictions == actual)
nb_accuracy <- correct / length(actual)
print(paste("Accuracy of the Naive Bayes model:", nb_accuracy))
## [1] "Accuracy of the Naive Bayes model: 0.936170212765957"
# Linear discriminant analysis.
# NOTE(review): lda() is from the MASS package; no library(MASS) call is
# visible in this file — confirm it is loaded (or attached by a dependency)
# in an earlier chunk.
lda_model <- lda(NObeyesdad~., train_data)
summary(lda_model)
## Length Class Mode
## prior 7 -none- numeric
## counts 7 -none- numeric
## means 168 -none- numeric
## scaling 144 -none- numeric
## lev 7 -none- character
## svd 6 -none- numeric
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 8 -none- list
# Score the LDA model; predict() returns a list whose $class element holds the
# predicted labels.
predictions <- predict(lda_model, newdata = test_data)
predicted_labels <- predictions$class
actual_labels <- test_data$NObeyesdad
lda_accuracy <- mean(predicted_labels == actual_labels)
print(paste("Accuracy of the LDA model:", lda_accuracy))
## [1] "Accuracy of the LDA model: 0.898345153664303"
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# NOTE(review): at this point `predictions` is the LIST returned by
# predict() on the lda fit, not a vector of labels, so the original
# `predictions == actual` comparison was not a valid accuracy computation.
# Compare the extracted class labels instead.
actual <- test_data$NObeyesdad # The actual labels
correct <- sum(predicted_labels == actual) # Count correct predictions
accuracy <- correct / length(actual) # Calculate accuracy
# Impute missing values with mean
# Fit Random Forest model on imputed data
rf_model_imputed <- randomForest(NObeyesdad ~ ., data = train_data)
# Score the forest on the held-out test set.
rf_pred_imputed <- predict(rf_model_imputed, newdata = test_data)
# Accuracy = share of test rows the forest labels correctly.
rf_matches <- rf_pred_imputed == test_data$NObeyesdad
rf_accuracy <- sum(rf_matches) / length(rf_matches)
print(paste("Random Forest Accuracy (with imputed data):", rf_accuracy))
## [1] "Random Forest Accuracy (with imputed data): 0.99290780141844"
# Support vector machine with default settings on the training data.
svm_model <- svm(NObeyesdad ~ ., data = train_data)
svm_pred <- predict(svm_model, newdata = test_data)
# Proportion of correctly classified test rows.
svm_hits <- svm_pred == test_data$NObeyesdad
svm_accuracy <- sum(svm_hits) / length(svm_hits)
print(paste("SVM Accuracy:", svm_accuracy))
## [1] "SVM Accuracy: 0.933806146572104"
library(rpart)
library(rpart.plot)
library(nnet)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
library(class)
# Fit the decision tree model using rpart
tree_model <- rpart(NObeyesdad ~ ., data = train_data, method = "class")
# Plot the decision tree. box.palette = 0 because the response has 7
# classes, more than the default palette supports; rpart.plot's own
# warning says the boxes are drawn white regardless and recommends
# box.palette = 0 to silence it, so the rendered plot is unchanged.
rpart.plot(tree_model, type = 4, extra = 101, fallen.leaves = TRUE, box.palette = 0)
## Warning: All boxes will be white (the box.palette argument will be ignored) because
## the number of classes in the response 7 is greater than length(box.palette) 6.
## To silence this warning use box.palette=0 or trace=-1.
# Class predictions from the fitted tree on the test set.
tree_predictions <- predict(tree_model, newdata = test_data, type = "class")
tree_actual_classes <- test_data$NObeyesdad
# Calculate accuracy: proportion of correct predictions
tree_hits <- sum(tree_predictions == tree_actual_classes)
tree_accuracy <- tree_hits / length(tree_actual_classes)
# Print the testing accuracy
print(paste("Testing Accuracy of the decision tree model:", tree_accuracy))
## [1] "Testing Accuracy of the decision tree model: 0.969267139479905"
# NOTE(review): this repeats the identical plot call from earlier in the
# document (kept for the report layout). box.palette = 0 silences the
# 7-class palette warning without changing the rendered (white-box) plot.
rpart.plot(tree_model, type = 4, extra = 101, fallen.leaves = TRUE, box.palette = 0)
## Warning: All boxes will be white (the box.palette argument will be ignored) because
## the number of classes in the response 7 is greater than length(box.palette) 6.
## To silence this warning use box.palette=0 or trace=-1.
# Normalize the numeric features
numeric_cols <- sapply(train_data, is.numeric)
# Center/scale parameters are estimated from the training data only and
# applied to both sets, so train and test share one feature space.
preproc <- preProcess(train_data[, numeric_cols], method = c("center", "scale"))
train_data_norm <- predict(preproc, train_data)
test_data_norm <- predict(preproc, test_data)
# Single-hidden-layer neural network with 3 hidden units.
nn_model <- nnet(NObeyesdad ~ ., data = train_data_norm, size = 3, maxit = 200)
## # weights: 103
## initial value 3650.939100
## iter 10 value 1411.263845
## iter 20 value 1073.807853
## iter 30 value 875.898472
## iter 40 value 669.258917
## iter 50 value 533.208453
## iter 60 value 446.811551
## iter 70 value 385.815436
## iter 80 value 362.376371
## iter 90 value 348.339412
## iter 100 value 299.648481
## iter 110 value 195.205075
## iter 120 value 149.561059
## iter 130 value 124.924237
## iter 140 value 115.659332
## iter 150 value 113.261008
## iter 160 value 108.820121
## iter 170 value 106.471205
## iter 180 value 104.421707
## iter 190 value 102.559832
## iter 200 value 101.989389
## final value 101.989389
## stopped after 200 iterations
# Predict using the neural network model on test data
nn_predictions <- predict(nn_model, newdata = test_data_norm, type = "class")
# Align factor levels with the test labels so the comparison is well defined.
nn_predictions <- factor(nn_predictions, levels = levels(test_data$NObeyesdad))
# Calculate accuracy
nn_hits <- nn_predictions == test_data$NObeyesdad
nn_accuracy <- sum(nn_hits) / length(nn_hits)
print(paste("Neural Network Accuracy:", nn_accuracy))
## [1] "Neural Network Accuracy: 0.955082742316785"
# Calculate confusion matrix
conf_matrix <- confusionMatrix(nn_predictions, test_data$NObeyesdad)
# BUG FIX: for a multi-class outcome, conf_matrix$byClass is a MATRIX with
# one row per class, so byClass["Sensitivity"] returns NA (the original
# transcript printed "NA"). Select the per-class column and report the
# macro (unweighted) average across classes.
nn_sensitivity <- mean(conf_matrix$byClass[, "Sensitivity"], na.rm = TRUE)
nn_specificity <- mean(conf_matrix$byClass[, "Specificity"], na.rm = TRUE)
print(paste("Neural Network Sensitivity:", nn_sensitivity))
print(paste("Neural Network Specificity:", nn_specificity))
# BUG FIX: the original scaled train and test independently, giving the two
# sets inconsistent feature spaces (and leaking test-set statistics into the
# normalization). Estimate center/spread on the TRAINING data only and apply
# the same transform to both sets.
knn_numeric_cols <- sapply(train_data, is.numeric)
knn_center <- colMeans(train_data[, knn_numeric_cols])
knn_spread <- apply(train_data[, knn_numeric_cols], 2, sd)
train_data_normalized <- as.data.frame(scale(train_data[, knn_numeric_cols], center = knn_center, scale = knn_spread))
test_data_normalized <- as.data.frame(scale(test_data[, knn_numeric_cols], center = knn_center, scale = knn_spread))
# Train a kNN model
knn_model <- knn(train = train_data_normalized, test = test_data_normalized, cl = train_data$NObeyesdad, k = 5)
# Convert predictions and actual values to factors with the same levels
knn_model <- factor(knn_model, levels = levels(test_data$NObeyesdad))
# Calculate accuracy for kNN model
knn_accuracy <- mean(knn_model == test_data$NObeyesdad)
print(paste("kNN Accuracy:", knn_accuracy))
# Calculate confusion matrix for kNN model
conf_matrix_knn <- confusionMatrix(knn_model, test_data$NObeyesdad)
# BUG FIX: byClass is a per-class matrix for multi-class outcomes; the
# original byClass["Sensitivity"] lookup returned NA. Use the per-class
# column and report the macro average.
knn_sensitivity <- mean(conf_matrix_knn$byClass[, "Sensitivity"], na.rm = TRUE)
knn_specificity <- mean(conf_matrix_knn$byClass[, "Specificity"], na.rm = TRUE)
print(paste("kNN Sensitivity:", knn_sensitivity))
print(paste("kNN Specificity:", knn_specificity))
# Combine all model accuracies for comparison
models <- c("Multinomial Logistic Regression", "Naive Bayes", "LDA", "Random Forest", "SVM", "Decision Tree", "Neural Network", "kNN")
test_accuracies <- c(multinom_accuracy, nb_accuracy, lda_accuracy, rf_accuracy, svm_accuracy, tree_accuracy, nn_accuracy, knn_accuracy)
# Create a data frame
accuracy_data <- data.frame(Model = models, Accuracy = test_accuracies)
library(RColorBrewer)
# One qualitative colour per model from the Set3 palette.
palette_colors <- brewer.pal(n = length(unique(accuracy_data$Model)), name = "Set3")
# Sort models from best to worst test accuracy.
sorted_accuracy_data <- accuracy_data[order(accuracy_data$Accuracy, decreasing = TRUE), ]
# Print the sorted accuracy dataframe
print(sorted_accuracy_data)
## Model Accuracy
## 4 Random Forest 0.9929078
## 6 Decision Tree 0.9692671
## 7 Neural Network 0.9550827
## 1 Multinomial Logistic Regression 0.9456265
## 2 Naive Bayes 0.9361702
## 5 SVM 0.9338061
## 3 LDA 0.8983452
## 8 kNN 0.8557920
# Initialize per-model vectors to store TP, TN, FP, FN counts.
n_models <- length(models)
TP <- numeric(n_models)
TN <- numeric(n_models)
FP <- numeric(n_models)
FN <- numeric(n_models)
# One-vs-rest confusion counts treating "Obesity_Type_I" as the positive
# class. Returns a named vector c(TP, TN, FP, FN).
calculate_metrics <- function(predictions, actual) {
  positive <- "Obesity_Type_I"
  TP <- sum(predictions == positive & actual == positive)
  # BUG FIX: a true negative is any case where BOTH the prediction and the
  # actual label are negative. The original used
  # `predictions != actual & predictions != positive`, which wrongly counts
  # false negatives as TN and excludes correctly classified negatives.
  TN <- sum(predictions != positive & actual != positive)
  FP <- sum(predictions == positive & actual != positive)
  FN <- sum(predictions != positive & actual == positive)
  return(c(TP = TP, TN = TN, FP = FP, FN = FN))
}
# Calculate TP, TN, FP, FN for each model.
# BUG FIX (review): the original if/else chain had no branches for
# "Neural Network" or "kNN", so those two rows silently reused the
# Decision Tree counts (the transcript shows identical 0.9375 sensitivity
# for all three). The Naive Bayes branch also read the global `predictions`,
# which had been overwritten by the later LDA predict() call; recompute the
# Naive Bayes predictions from the fitted model instead.
for (i in seq_along(models)) {
  if (models[i] == "Multinomial Logistic Regression") {
    metrics <- calculate_metrics(test_predictions, test_actual_classes)
  } else if (models[i] == "Naive Bayes") {
    metrics <- calculate_metrics(predict(naive_model, test_data), test_data$NObeyesdad)
  } else if (models[i] == "LDA") {
    metrics <- calculate_metrics(predicted_labels, actual_labels)
  } else if (models[i] == "Random Forest") {
    metrics <- calculate_metrics(rf_pred_imputed, test_data$NObeyesdad)
  } else if (models[i] == "SVM") {
    metrics <- calculate_metrics(svm_pred, test_data$NObeyesdad)
  } else if (models[i] == "Decision Tree") {
    metrics <- calculate_metrics(tree_predictions, tree_actual_classes)
  } else if (models[i] == "Neural Network") {
    metrics <- calculate_metrics(nn_predictions, test_data$NObeyesdad)
  } else if (models[i] == "kNN") {
    metrics <- calculate_metrics(knn_model, test_data$NObeyesdad)
  }
  TP[i] <- metrics["TP"]
  TN[i] <- metrics["TN"]
  FP[i] <- metrics["FP"]
  FN[i] <- metrics["FN"]
}
# Calculate sensitivity and specificity from the one-vs-rest counts.
sensitivity <- TP / (TP + FN)
specificity <- TN / (TN + FP)
# Assemble per-model accuracy/sensitivity/specificity in one data frame.
metrics_data <- data.frame(Model = models, Accuracy = test_accuracies, Sensitivity = sensitivity, Specificity = specificity)
# Order rows by descending accuracy for reporting.
sorted_metrics_data <- metrics_data %>% arrange(desc(Accuracy))
# Print the sorted metrics data frame
print(sorted_metrics_data)
## Model Accuracy Sensitivity Specificity
## 1 Random Forest 0.9929078 1.000000 1.0000000
## 2 Decision Tree 0.9692671 0.937500 0.9230769
## 3 Neural Network 0.9550827 0.937500 0.9230769
## 4 Multinomial Logistic Regression 0.9456265 0.921875 0.9565217
## 5 Naive Bayes 0.9361702 0.000000 1.0000000
## 6 SVM 0.9338061 0.937500 0.8928571
## 7 LDA 0.8983452 0.906250 0.9534884
## 8 kNN 0.8557920 0.937500 0.9230769
# Print the dataframe
# (unsorted, in original model order; the accuracy-sorted view was printed earlier)
print(metrics_data)
## Model Accuracy Sensitivity Specificity
## 1 Multinomial Logistic Regression 0.9456265 0.921875 0.9565217
## 2 Naive Bayes 0.9361702 0.000000 1.0000000
## 3 LDA 0.8983452 0.906250 0.9534884
## 4 Random Forest 0.9929078 1.000000 1.0000000
## 5 SVM 0.9338061 0.937500 0.8928571
## 6 Decision Tree 0.9692671 0.937500 0.9230769
## 7 Neural Network 0.9550827 0.937500 0.9230769
## 8 kNN 0.8557920 0.937500 0.9230769
# Helper: bar chart of one metrics_data column, one bar per model, using the
# shared Set3 palette and slanted x labels.
plot_model_metric <- function(metric_name) {
  ggplot(metrics_data, aes(x = Model, y = .data[[metric_name]], fill = Model)) +
    geom_bar(stat = "identity") +
    labs(title = paste(metric_name, "of Different Models"),
         x = "Model", y = metric_name) +
    scale_fill_manual(values = palette_colors) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
# Plot Sensitivity
plot_model_metric("Sensitivity")
# Plot Specificity
plot_model_metric("Specificity")
# Plot Accuracy (if not already included)
plot_model_metric("Accuracy")
# Load necessary libraries
# Set seed for reproducibility
set.seed(123)
# Define the number of folds
k <- 5
# Shared caret control object: k-fold cross-validation.
train_control <- trainControl(method = "cv", number = k)
# Fit one model type with k-fold cross-validation via caret::train.
evaluate_model <- function(model_formula, method, train_data, train_control) {
  # train() handles the resampling and returns the fitted model object.
  train(model_formula, data = train_data, method = method, trControl = train_control)
}
# Compute test-set accuracy from a caret confusion matrix.
# (Despite the original header comment, only accuracy is returned;
# sensitivity/specificity are handled elsewhere in this document.)
calculate_metrics <- function(predictions, actual) {
  conf_matrix <- confusionMatrix(predictions, actual)
  conf_matrix$overall["Accuracy"]
}
# Initialize lists to store metrics for each model
model_names <- c("Multinomial Logistic Regression", "Naive Bayes", "LDA", "Random Forest", "SVM", "Decision Tree", "Neural Network", "kNN")
accuracies <- c()
# Multinomial Logistic Regression, cross-validated via the shared helper.
multinom_model <- evaluate_model(NObeyesdad ~ ., method = "multinom", train_data = train_data, train_control = train_control)
## # weights: 182 (150 variable)
## initial value 2628.924611
## iter 10 value 2022.032314
## iter 20 value 1476.583416
## iter 30 value 1130.422179
## iter 40 value 873.014855
## iter 50 value 598.325272
## iter 60 value 287.562506
## iter 70 value 111.021262
## iter 80 value 60.743904
## iter 90 value 43.007848
## iter 100 value 38.196909
## final value 38.196909
## stopped after 100 iterations
## # weights: 182 (150 variable)
## initial value 2628.924611
## iter 10 value 2022.084832
## iter 20 value 1480.441608
## iter 30 value 1157.901521
## iter 40 value 940.374510
## iter 50 value 768.841570
## iter 60 value 674.456690
## iter 70 value 654.967581
## iter 80 value 641.823023
## iter 90 value 635.637389
## iter 100 value 629.406062
## final value 629.406062
## stopped after 100 iterations
## # weights: 182 (150 variable)
## initial value 2628.924611
## iter 10 value 2022.032367
## iter 20 value 1476.587298
## iter 30 value 1130.452999
## iter 40 value 873.111503
## iter 50 value 599.158133
## iter 60 value 290.647944
## iter 70 value 134.039395
## iter 80 value 100.967698
## iter 90 value 89.746072
## iter 100 value 87.104075
## final value 87.104075
## stopped after 100 iterations
## # weights: 182 (150 variable)
## initial value 2626.978701
## iter 10 value 2080.944367
## iter 20 value 1445.668432
## iter 30 value 1082.846426
## iter 40 value 843.809978
## iter 50 value 572.939272
## iter 60 value 283.817472
## iter 70 value 102.233932
## iter 80 value 62.729350
## iter 90 value 53.936195
## iter 100 value 46.971832
## final value 46.971832
## stopped after 100 iterations
## # weights: 182 (150 variable)
## initial value 2626.978701
## iter 10 value 2081.031363
## iter 20 value 1451.884487
## iter 30 value 1116.563199
## iter 40 value 912.322646
## iter 50 value 759.804146
## iter 60 value 655.330044
## iter 70 value 635.162891
## iter 80 value 621.033980
## iter 90 value 612.253015
## iter 100 value 606.604510
## final value 606.604510
## stopped after 100 iterations
## # weights: 182 (150 variable)
## initial value 2626.978701
## iter 10 value 2080.944454
## iter 20 value 1445.674700
## iter 30 value 1082.885861
## iter 40 value 843.908006
## iter 50 value 573.986096
## iter 60 value 286.964342
## iter 70 value 129.856370
## iter 80 value 102.594690
## iter 90 value 91.776011
## iter 100 value 87.523063
## final value 87.523063
## stopped after 100 iterations
## # weights: 182 (150 variable)
## initial value 2628.924611
## iter 10 value 2064.001874
## iter 20 value 1466.859383
## iter 30 value 1112.414341
## iter 40 value 856.578882
## iter 50 value 542.549274
## iter 60 value 295.561693
## iter 70 value 84.283450
## iter 80 value 53.332430
## iter 90 value 30.587717
## iter 100 value 15.081132
## final value 15.081132
## stopped after 100 iterations
## # weights: 182 (150 variable)
## initial value 2628.924611
## iter 10 value 2064.094789
## iter 20 value 1471.098043
## iter 30 value 1140.958381
## iter 40 value 916.617940
## iter 50 value 765.774797
## iter 60 value 676.937397
## iter 70 value 648.758164
## iter 80 value 632.700629
## iter 90 value 622.703277
## iter 100 value 617.407475
## final value 617.407475
## stopped after 100 iterations
## # weights: 182 (150 variable)
## initial value 2628.924611
## iter 10 value 2064.001967
## iter 20 value 1466.863652
## iter 30 value 1112.446162
## iter 40 value 856.658245
## iter 50 value 543.163928
## iter 60 value 297.908165
## iter 70 value 116.022877
## iter 80 value 96.494080
## iter 90 value 85.566996
## iter 100 value 82.292844
## final value 82.292844
## stopped after 100 iterations
## # weights: 182 (150 variable)
## initial value 2632.816432
## iter 10 value 2051.787661
## iter 20 value 1527.858728
## iter 30 value 1212.439635
## iter 40 value 883.763701
## iter 50 value 613.004629
## iter 60 value 280.073439
## iter 70 value 108.124422
## iter 80 value 50.876338
## iter 90 value 37.378869
## iter 100 value 32.414896
## final value 32.414896
## stopped after 100 iterations
## # weights: 182 (150 variable)
## initial value 2632.816432
## iter 10 value 2051.840921
## iter 20 value 1530.661364
## iter 30 value 1234.112656
## iter 40 value 947.565063
## iter 50 value 785.193342
## iter 60 value 675.867146
## iter 70 value 644.442538
## iter 80 value 628.342206
## iter 90 value 621.867839
## iter 100 value 613.348798
## final value 613.348798
## stopped after 100 iterations
## # weights: 182 (150 variable)
## initial value 2632.816432
## iter 10 value 2051.787714
## iter 20 value 1527.861549
## iter 30 value 1212.463626
## iter 40 value 883.854251
## iter 50 value 613.499688
## iter 60 value 283.696067
## iter 70 value 132.743227
## iter 80 value 98.283742
## iter 90 value 89.585819
## iter 100 value 87.253416
## final value 87.253416
## stopped after 100 iterations
## # weights: 182 (150 variable)
## initial value 2621.140971
## iter 10 value 2082.735255
## iter 20 value 1502.977748
## iter 30 value 1107.099412
## iter 40 value 882.944761
## iter 50 value 592.586249
## iter 60 value 324.505584
## iter 70 value 119.986628
## iter 80 value 64.846638
## iter 90 value 47.442752
## iter 100 value 40.335023
## final value 40.335023
## stopped after 100 iterations
## # weights: 182 (150 variable)
## initial value 2621.140971
## iter 10 value 2082.824393
## iter 20 value 1507.248312
## iter 30 value 1137.810343
## iter 40 value 948.454438
## iter 50 value 799.149281
## iter 60 value 688.321741
## iter 70 value 659.904019
## iter 80 value 642.706226
## iter 90 value 633.865181
## iter 100 value 628.888377
## final value 628.888377
## stopped after 100 iterations
## # weights: 182 (150 variable)
## initial value 2621.140971
## iter 10 value 2082.735345
## iter 20 value 1502.982046
## iter 30 value 1107.133822
## iter 40 value 883.037432
## iter 50 value 593.060514
## iter 60 value 327.423481
## iter 70 value 143.443113
## iter 80 value 109.867335
## iter 90 value 96.443348
## iter 100 value 92.889726
## final value 92.889726
## stopped after 100 iterations
## # weights: 182 (150 variable)
## initial value 3284.696332
## iter 10 value 2606.909296
## iter 20 value 1892.277477
## iter 30 value 1533.263093
## iter 40 value 1107.866374
## iter 50 value 871.352857
## iter 60 value 413.905956
## iter 70 value 193.877492
## iter 80 value 149.047500
## iter 90 value 119.994103
## iter 100 value 108.974723
## final value 108.974723
## stopped after 100 iterations
# Hold-out accuracy of the cross-validated multinomial model.
multinom_predictions <- predict(multinom_model, newdata = test_data)
metrics <- calculate_metrics(multinom_predictions, test_data$NObeyesdad)
accuracies <- c(accuracies, metrics[1])
# Naive Bayes, cross-validated via the shared helper.
nb_model <- evaluate_model(NObeyesdad ~ ., method = "nb", train_data = train_data, train_control = train_control)
## Warning: model fit failed for Fold1: usekernel=FALSE, fL=0, adjust=1 Error in NaiveBayes.default(x, y, usekernel = FALSE, fL = param$fL, ...) :
## Zero variances for at least one class in variables: CALCSometimes, CALCFrequently, CALCAlways, FAVCyes, FCVC, NCP, SCCyes, family_history_with_overweightno, CAECAlways, CAECno, MTRANSWalking, MTRANSAutomobile, MTRANSMotorbike, MTRANSBike
## Warning: model fit failed for Fold2: usekernel=FALSE, fL=0, adjust=1 Error in NaiveBayes.default(x, y, usekernel = FALSE, fL = param$fL, ...) :
## Zero variances for at least one class in variables: CALCSometimes, CALCFrequently, CALCAlways, FAVCyes, FCVC, NCP, SCCyes, family_history_with_overweightno, CAECFrequently, CAECAlways, CAECno, MTRANSWalking, MTRANSAutomobile, MTRANSMotorbike, MTRANSBike
## Warning: model fit failed for Fold3: usekernel=FALSE, fL=0, adjust=1 Error in NaiveBayes.default(x, y, usekernel = FALSE, fL = param$fL, ...) :
## Zero variances for at least one class in variables: GenderMale, CALCSometimes, CALCFrequently, CALCAlways, FAVCyes, FCVC, NCP, SCCyes, SMOKEyes, family_history_with_overweightno, CAECFrequently, CAECAlways, CAECno, MTRANSWalking, MTRANSAutomobile, MTRANSMotorbike, MTRANSBike
## Warning: model fit failed for Fold4: usekernel=FALSE, fL=0, adjust=1 Error in NaiveBayes.default(x, y, usekernel = FALSE, fL = param$fL, ...) :
## Zero variances for at least one class in variables: CALCSometimes, CALCFrequently, CALCAlways, FAVCyes, FCVC, NCP, SCCyes, family_history_with_overweightno, CAECAlways, CAECno, MTRANSWalking, MTRANSAutomobile, MTRANSMotorbike, MTRANSBike
## Warning: model fit failed for Fold5: usekernel=FALSE, fL=0, adjust=1 Error in NaiveBayes.default(x, y, usekernel = FALSE, fL = param$fL, ...) :
## Zero variances for at least one class in variables: CALCSometimes, CALCFrequently, CALCAlways, FAVCyes, FCVC, NCP, SCCyes, family_history_with_overweightno, CAECAlways, CAECno, MTRANSWalking, MTRANSAutomobile, MTRANSMotorbike, MTRANSBike
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Warning in train.default(x, y, weights = w, ...): missing values found in
## aggregated results
# Hold-out accuracy of the cross-validated Naive Bayes model.
nb_predictions <- predict(nb_model, newdata = test_data)
metrics <- calculate_metrics(nb_predictions, test_data$NObeyesdad)
accuracies <- c(accuracies, metrics[1])
# Linear Discriminant Analysis (LDA), cross-validated via the shared helper.
lda_model <- evaluate_model(NObeyesdad ~ ., method = "lda", train_data = train_data, train_control = train_control)
## Warning: model fit failed for Fold3: parameter=none Error in lda.default(x, grouping, ...) :
## variable 7 appears to be constant within groups
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Hold-out accuracy of the cross-validated LDA model.
lda_predictions <- predict(lda_model, newdata = test_data)
metrics <- calculate_metrics(lda_predictions, test_data$NObeyesdad)
accuracies <- c(accuracies, metrics[1])
# Random Forest, cross-validated, then scored on the test set.
rf_model <- evaluate_model(NObeyesdad ~ ., method = "rf", train_data = train_data, train_control = train_control)
rf_predictions <- predict(rf_model, newdata = test_data)
metrics <- calculate_metrics(rf_predictions, test_data$NObeyesdad)
accuracies <- c(accuracies, metrics[1])
# Support Vector Machine (SVM) with a radial kernel, cross-validated.
svm_model <- evaluate_model(NObeyesdad ~ ., method = "svmRadial", train_data = train_data, train_control = train_control)
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
# Hold-out accuracy of the cross-validated radial SVM.
svm_predictions <- predict(svm_model, newdata = test_data)
metrics <- calculate_metrics(svm_predictions, test_data$NObeyesdad)
accuracies <- c(accuracies, metrics[1])
# Decision Tree, cross-validated, then scored on the test set.
tree_model <- evaluate_model(NObeyesdad ~ ., method = "rpart", train_data = train_data, train_control = train_control)
tree_predictions <- predict(tree_model, newdata = test_data)
metrics <- calculate_metrics(tree_predictions, test_data$NObeyesdad)
accuracies <- c(accuracies, metrics[1])
# Neural Network, cross-validated on the normalised frame.
# NOTE(review): downstream predictions should use test_data_norm so the
# feature space matches the training data — confirm against later chunks.
nn_model <- evaluate_model(NObeyesdad ~ ., method = "nnet", train_data = train_data_norm, train_control = train_control)
## # weights: 39
## initial value 2757.052505
## iter 10 value 2150.572080
## iter 20 value 1738.812687
## iter 30 value 1573.411806
## iter 40 value 1262.280002
## iter 50 value 943.750268
## iter 60 value 540.621389
## iter 70 value 311.800422
## iter 80 value 182.178722
## iter 90 value 149.138122
## iter 100 value 148.211097
## final value 148.211097
## stopped after 100 iterations
## # weights: 103
## initial value 2812.049289
## iter 10 value 1068.392086
## iter 20 value 715.177290
## iter 30 value 509.472409
## iter 40 value 366.865758
## iter 50 value 276.762521
## iter 60 value 208.662877
## iter 70 value 169.885574
## iter 80 value 112.340999
## iter 90 value 80.545602
## iter 100 value 63.320302
## final value 63.320302
## stopped after 100 iterations
## # weights: 167
## initial value 2899.181146
## iter 10 value 1283.911083
## iter 20 value 471.664967
## iter 30 value 341.265617
## iter 40 value 274.723808
## iter 50 value 240.575636
## iter 60 value 193.436357
## iter 70 value 129.370969
## iter 80 value 104.911829
## iter 90 value 82.549131
## iter 100 value 64.127179
## final value 64.127179
## stopped after 100 iterations
## # weights: 39
## initial value 2871.408575
## iter 10 value 2108.527502
## iter 20 value 2066.421141
## iter 30 value 1981.766469
## iter 40 value 1966.527595
## iter 50 value 1961.502114
## iter 60 value 1921.954233
## iter 70 value 1776.339246
## iter 80 value 1470.525051
## iter 90 value 1321.476920
## iter 100 value 1232.423641
## final value 1232.423641
## stopped after 100 iterations
## # weights: 103
## initial value 2739.804436
## iter 10 value 1604.616995
## iter 20 value 1163.885922
## iter 30 value 923.539212
## iter 40 value 818.855410
## iter 50 value 733.430644
## iter 60 value 657.426918
## iter 70 value 571.240473
## iter 80 value 537.379120
## iter 90 value 528.417072
## iter 100 value 522.578559
## final value 522.578559
## stopped after 100 iterations
## # weights: 167
## initial value 2713.524814
## iter 10 value 1067.198953
## iter 20 value 705.482302
## iter 30 value 577.084423
## iter 40 value 483.706011
## iter 50 value 440.465485
## iter 60 value 398.380065
## iter 70 value 364.898957
## iter 80 value 327.739235
## iter 90 value 307.986792
## iter 100 value 300.592349
## final value 300.592349
## stopped after 100 iterations
## # weights: 39
## initial value 2673.060718
## iter 10 value 1683.486928
## iter 20 value 1503.796444
## iter 30 value 1353.583265
## iter 40 value 1140.341397
## iter 50 value 1005.277751
## iter 60 value 964.898034
## iter 70 value 905.706952
## iter 80 value 784.450074
## iter 90 value 735.389156
## iter 100 value 692.023416
## final value 692.023416
## stopped after 100 iterations
## # weights: 103
## initial value 2874.976897
## iter 10 value 1028.377478
## iter 20 value 714.571280
## iter 30 value 492.812832
## iter 40 value 356.672487
## iter 50 value 273.887143
## iter 60 value 186.504953
## iter 70 value 131.881460
## iter 80 value 107.648105
## iter 90 value 99.608466
## iter 100 value 91.756917
## final value 91.756917
## stopped after 100 iterations
## # weights: 167
## initial value 2905.094099
## iter 10 value 1292.929354
## iter 20 value 750.743715
## iter 30 value 469.328958
## iter 40 value 354.793480
## iter 50 value 285.550544
## iter 60 value 235.723471
## iter 70 value 201.220640
## iter 80 value 185.255934
## iter 90 value 175.031737
## iter 100 value 148.429877
## final value 148.429877
## stopped after 100 iterations
## # weights: 39
## initial value 2748.705423
## iter 10 value 1944.827380
## iter 20 value 1720.510218
## iter 30 value 1662.463335
## iter 40 value 1637.943100
## iter 50 value 1619.487103
## iter 60 value 1597.760294
## iter 70 value 1456.264795
## iter 80 value 1208.634245
## iter 90 value 908.677922
## iter 100 value 575.503797
## final value 575.503797
## stopped after 100 iterations
## # weights: 103
## initial value 2871.198574
## iter 10 value 1274.581335
## iter 20 value 866.216198
## iter 30 value 654.808645
## iter 40 value 461.192525
## iter 50 value 325.701488
## iter 60 value 183.181852
## iter 70 value 113.549794
## iter 80 value 102.657361
## iter 90 value 101.245801
## iter 100 value 98.202964
## final value 98.202964
## stopped after 100 iterations
## # weights: 167
## initial value 2696.206235
## iter 10 value 1071.220057
## iter 20 value 640.476537
## iter 30 value 353.227603
## iter 40 value 223.640902
## iter 50 value 164.263198
## iter 60 value 109.006257
## iter 70 value 71.817934
## iter 80 value 54.098330
## iter 90 value 42.288665
## iter 100 value 37.120007
## final value 37.120007
## stopped after 100 iterations
## # weights: 39
## initial value 2723.347436
## iter 10 value 2430.862254
## iter 20 value 2176.361441
## iter 30 value 2017.692920
## iter 40 value 1893.568984
## iter 50 value 1532.022670
## iter 60 value 1386.314009
## iter 70 value 1299.649531
## iter 80 value 1231.529229
## iter 90 value 1229.083280
## iter 100 value 1227.208123
## final value 1227.208123
## stopped after 100 iterations
## # weights: 103
## initial value 2737.683875
## iter 10 value 1522.685271
## iter 20 value 1258.101035
## iter 30 value 908.945481
## iter 40 value 776.987774
## iter 50 value 653.338881
## iter 60 value 596.527797
## iter 70 value 555.425433
## iter 80 value 546.058938
## iter 90 value 541.109106
## iter 100 value 530.419401
## final value 530.419401
## stopped after 100 iterations
## # weights: 167
## initial value 2963.507050
## iter 10 value 1076.173598
## iter 20 value 730.381698
## iter 30 value 613.489591
## iter 40 value 505.152481
## iter 50 value 449.603823
## iter 60 value 421.466071
## iter 70 value 394.800503
## iter 80 value 377.292908
## iter 90 value 358.930148
## iter 100 value 323.206335
## final value 323.206335
## stopped after 100 iterations
## # weights: 39
## initial value 2841.000445
## iter 10 value 1908.034664
## iter 20 value 1768.864552
## iter 30 value 1672.265744
## iter 40 value 1458.990458
## iter 50 value 1088.849708
## iter 60 value 852.627679
## iter 70 value 603.280297
## iter 80 value 464.086668
## iter 90 value 461.989253
## iter 100 value 453.246295
## final value 453.246295
## stopped after 100 iterations
## # weights: 103
## initial value 2664.912689
## iter 10 value 1551.506638
## iter 20 value 962.782487
## iter 30 value 763.270148
## iter 40 value 562.277169
## iter 50 value 472.327950
## iter 60 value 414.696474
## iter 70 value 383.160165
## iter 80 value 327.458469
## iter 90 value 243.050555
## iter 100 value 217.961731
## final value 217.961731
## stopped after 100 iterations
## # weights: 167
## initial value 2780.815996
## iter 10 value 1010.293085
## iter 20 value 670.078253
## iter 30 value 448.713707
## iter 40 value 351.929993
## iter 50 value 259.888352
## iter 60 value 187.271020
## iter 70 value 133.209864
## iter 80 value 102.143769
## iter 90 value 90.037575
## iter 100 value 80.783701
## final value 80.783701
## stopped after 100 iterations
## # weights: 39
## initial value 2852.520803
## iter 10 value 2221.282414
## iter 20 value 1769.773123
## iter 30 value 1532.943874
## iter 40 value 1259.111054
## iter 50 value 1135.557569
## iter 60 value 1086.620492
## iter 70 value 1073.385542
## iter 80 value 1041.007305
## iter 90 value 1035.204517
## iter 100 value 1028.294454
## final value 1028.294454
## stopped after 100 iterations
## # weights: 103
## initial value 2857.596157
## iter 10 value 1377.506551
## iter 20 value 888.123076
## iter 30 value 675.626814
## iter 40 value 563.182397
## iter 50 value 444.304347
## iter 60 value 339.784813
## iter 70 value 269.971720
## iter 80 value 214.511717
## iter 90 value 168.270644
## iter 100 value 132.484321
## final value 132.484321
## stopped after 100 iterations
## # weights: 167
## initial value 2940.093879
## iter 10 value 847.627737
## iter 20 value 384.706461
## iter 30 value 207.033658
## iter 40 value 133.152555
## iter 50 value 88.004151
## iter 60 value 65.437889
## iter 70 value 52.251595
## iter 80 value 45.781035
## iter 90 value 41.901719
## iter 100 value 39.699935
## final value 39.699935
## stopped after 100 iterations
## # weights: 39
## initial value 2763.269372
## iter 10 value 2103.053080
## iter 20 value 1740.303652
## iter 30 value 1537.419219
## iter 40 value 1387.209150
## iter 50 value 1330.104410
## iter 60 value 1239.876163
## iter 70 value 1225.702776
## iter 80 value 1225.196102
## iter 90 value 1225.150793
## iter 100 value 1225.108040
## final value 1225.108040
## stopped after 100 iterations
## # weights: 103
## initial value 2851.731745
## iter 10 value 2250.353877
## iter 20 value 1561.479348
## iter 30 value 1328.112294
## iter 40 value 1176.020881
## iter 50 value 957.608664
## iter 60 value 834.973881
## iter 70 value 729.027137
## iter 80 value 657.980954
## iter 90 value 628.036760
## iter 100 value 607.700042
## final value 607.700042
## stopped after 100 iterations
## # weights: 167
## initial value 2871.806202
## iter 10 value 1151.192354
## iter 20 value 623.563067
## iter 30 value 487.631306
## iter 40 value 420.717367
## iter 50 value 383.176857
## iter 60 value 352.115642
## iter 70 value 328.511207
## iter 80 value 309.469306
## iter 90 value 294.801778
## iter 100 value 286.851755
## final value 286.851755
## stopped after 100 iterations
## # weights: 39
## initial value 2688.079010
## iter 10 value 2065.121972
## iter 20 value 1846.800317
## iter 30 value 1737.153769
## iter 40 value 1555.730385
## iter 50 value 1267.025265
## iter 60 value 1028.589046
## iter 70 value 811.976446
## iter 80 value 746.832450
## iter 90 value 739.130230
## iter 100 value 715.321695
## final value 715.321695
## stopped after 100 iterations
## # weights: 103
## initial value 2706.163709
## iter 10 value 1538.836257
## iter 20 value 1029.796927
## iter 30 value 830.898753
## iter 40 value 648.817329
## iter 50 value 501.356096
## iter 60 value 389.105638
## iter 70 value 318.450691
## iter 80 value 284.681443
## iter 90 value 260.098509
## iter 100 value 181.102637
## final value 181.102637
## stopped after 100 iterations
## # weights: 167
## initial value 2795.360875
## iter 10 value 1404.120376
## iter 20 value 908.803423
## iter 30 value 714.063021
## iter 40 value 565.808340
## iter 50 value 456.982911
## iter 60 value 366.381865
## iter 70 value 316.089929
## iter 80 value 274.795289
## iter 90 value 250.171126
## iter 100 value 235.917155
## final value 235.917155
## stopped after 100 iterations
## # weights: 39
## initial value 2733.169920
## iter 10 value 2430.354371
## iter 20 value 2254.682694
## iter 30 value 2150.114628
## iter 40 value 2111.424317
## iter 50 value 2019.843786
## iter 60 value 1948.007851
## iter 70 value 1911.358129
## iter 80 value 1895.482979
## iter 90 value 1887.829742
## iter 100 value 1875.235772
## final value 1875.235772
## stopped after 100 iterations
## # weights: 103
## initial value 2871.766380
## iter 10 value 1274.854730
## iter 20 value 830.040996
## iter 30 value 611.562586
## iter 40 value 477.116741
## iter 50 value 370.461419
## iter 60 value 319.854184
## iter 70 value 301.766885
## iter 80 value 290.542627
## iter 90 value 281.357433
## iter 100 value 273.047597
## final value 273.047597
## stopped after 100 iterations
## # weights: 167
## initial value 2774.518860
## iter 10 value 1202.079898
## iter 20 value 612.305458
## iter 30 value 422.103274
## iter 40 value 292.452799
## iter 50 value 188.723707
## iter 60 value 120.191487
## iter 70 value 90.430640
## iter 80 value 63.889884
## iter 90 value 51.972584
## iter 100 value 43.975424
## final value 43.975424
## stopped after 100 iterations
## # weights: 39
## initial value 2737.814998
## iter 10 value 2463.573895
## iter 20 value 2257.171306
## iter 30 value 1809.239064
## iter 40 value 1502.988420
## iter 50 value 1349.677804
## iter 60 value 1275.252891
## iter 70 value 1256.465634
## iter 80 value 1250.020619
## iter 90 value 1244.107433
## iter 100 value 1233.486017
## final value 1233.486017
## stopped after 100 iterations
## # weights: 103
## initial value 2732.667339
## iter 10 value 1344.312053
## iter 20 value 1070.522165
## iter 30 value 940.755339
## iter 40 value 847.902191
## iter 50 value 723.871097
## iter 60 value 667.542108
## iter 70 value 635.589121
## iter 80 value 609.948981
## iter 90 value 589.383439
## iter 100 value 554.231545
## final value 554.231545
## stopped after 100 iterations
## # weights: 167
## initial value 2935.615373
## iter 10 value 1096.106574
## iter 20 value 780.886425
## iter 30 value 599.424205
## iter 40 value 487.925572
## iter 50 value 451.477339
## iter 60 value 423.986072
## iter 70 value 394.669452
## iter 80 value 367.059912
## iter 90 value 319.483322
## iter 100 value 294.078953
## final value 294.078953
## stopped after 100 iterations
## # weights: 39
## initial value 2753.766514
## iter 10 value 2399.389917
## iter 20 value 1852.713748
## iter 30 value 1456.352549
## iter 40 value 1288.198863
## iter 50 value 1091.413813
## iter 60 value 769.135038
## iter 70 value 632.991918
## iter 80 value 568.093165
## iter 90 value 561.215988
## iter 100 value 532.383445
## final value 532.383445
## stopped after 100 iterations
## # weights: 103
## initial value 2724.248440
## iter 10 value 980.694957
## iter 20 value 649.057481
## iter 30 value 515.000328
## iter 40 value 376.805712
## iter 50 value 273.665936
## iter 60 value 189.291631
## iter 70 value 129.803307
## iter 80 value 110.028231
## iter 90 value 94.918235
## iter 100 value 86.799179
## final value 86.799179
## stopped after 100 iterations
## # weights: 167
## initial value 2956.321489
## iter 10 value 1022.480651
## iter 20 value 440.081374
## iter 30 value 301.519024
## iter 40 value 208.508049
## iter 50 value 156.980922
## iter 60 value 121.836317
## iter 70 value 90.136512
## iter 80 value 75.087575
## iter 90 value 57.145056
## iter 100 value 49.501246
## final value 49.501246
## stopped after 100 iterations
## # weights: 39
## initial value 2750.045778
## iter 10 value 2576.108313
## iter 20 value 2134.538394
## iter 30 value 1733.210237
## iter 40 value 1613.057489
## iter 50 value 1567.196421
## iter 60 value 1378.326177
## iter 70 value 1226.160346
## iter 80 value 1015.601750
## iter 90 value 1004.331873
## iter 100 value 954.712703
## final value 954.712703
## stopped after 100 iterations
## # weights: 103
## initial value 2746.069054
## iter 10 value 1372.240919
## iter 20 value 849.981844
## iter 30 value 660.421859
## iter 40 value 500.304491
## iter 50 value 346.112209
## iter 60 value 247.302449
## iter 70 value 180.235376
## iter 80 value 131.538535
## iter 90 value 109.047934
## iter 100 value 103.024935
## final value 103.024935
## stopped after 100 iterations
## # weights: 167
## initial value 2750.810358
## iter 10 value 882.060029
## iter 20 value 476.765450
## iter 30 value 353.395835
## iter 40 value 243.066177
## iter 50 value 181.575949
## iter 60 value 133.178789
## iter 70 value 94.147916
## iter 80 value 71.058744
## iter 90 value 59.413151
## iter 100 value 52.137165
## final value 52.137165
## stopped after 100 iterations
## # weights: 39
## initial value 2708.153667
## iter 10 value 2312.770437
## iter 20 value 2161.409688
## iter 30 value 2014.330515
## iter 40 value 1864.888166
## iter 50 value 1737.283096
## iter 60 value 1672.330721
## iter 70 value 1583.566306
## iter 80 value 1471.780011
## iter 90 value 1391.969118
## iter 100 value 1306.601186
## final value 1306.601186
## stopped after 100 iterations
## # weights: 103
## initial value 2746.801478
## iter 10 value 1209.715425
## iter 20 value 875.801863
## iter 30 value 729.353353
## iter 40 value 635.302362
## iter 50 value 581.971450
## iter 60 value 540.751459
## iter 70 value 524.755005
## iter 80 value 517.609307
## iter 90 value 514.358196
## iter 100 value 510.907397
## final value 510.907397
## stopped after 100 iterations
## # weights: 167
## initial value 2944.416634
## iter 10 value 965.112681
## iter 20 value 675.734704
## iter 30 value 532.570343
## iter 40 value 402.711856
## iter 50 value 333.380926
## iter 60 value 293.894844
## iter 70 value 270.123118
## iter 80 value 260.981664
## iter 90 value 256.925483
## iter 100 value 253.426019
## final value 253.426019
## stopped after 100 iterations
## # weights: 39
## initial value 2744.661370
## iter 10 value 2260.914389
## iter 20 value 2055.847344
## iter 30 value 1914.314561
## iter 40 value 1847.482344
## iter 50 value 1744.499684
## iter 60 value 1700.814245
## iter 70 value 1629.808616
## iter 80 value 1616.009616
## iter 90 value 1610.275253
## iter 100 value 1597.714788
## final value 1597.714788
## stopped after 100 iterations
## # weights: 103
## initial value 2692.487997
## iter 10 value 1454.385590
## iter 20 value 879.943161
## iter 30 value 705.380339
## iter 40 value 570.637094
## iter 50 value 458.831183
## iter 60 value 366.009173
## iter 70 value 283.038278
## iter 80 value 191.668380
## iter 90 value 133.085836
## iter 100 value 114.111456
## final value 114.111456
## stopped after 100 iterations
## # weights: 167
## initial value 2799.174982
## iter 10 value 983.947216
## iter 20 value 439.332487
## iter 30 value 193.668475
## iter 40 value 105.396817
## iter 50 value 75.092310
## iter 60 value 64.815405
## iter 70 value 59.775512
## iter 80 value 55.786585
## iter 90 value 53.289424
## iter 100 value 52.603977
## final value 52.603977
## stopped after 100 iterations
## # weights: 167
## initial value 3825.463387
## iter 10 value 1350.520046
## iter 20 value 956.334548
## iter 30 value 768.968219
## iter 40 value 645.754467
## iter 50 value 585.899658
## iter 60 value 543.647593
## iter 70 value 513.283876
## iter 80 value 482.516668
## iter 90 value 437.100200
## iter 100 value 345.671509
## final value 345.671509
## stopped after 100 iterations
# Neural Network: score the fitted nnet model on the held-out test set and
# append its accuracy to the running accuracy vector.
nn_predictions <- predict(nn_model, newdata = test_data)
metrics <- calculate_metrics(nn_predictions, test_data$NObeyesdad)
# metrics[1] holds the accuracy component returned by calculate_metrics()
accuracies <- c(accuracies, metrics[1])
# k-Nearest Neighbors (kNN) ----
# Distance-based methods require features on a common scale. Centering and
# scaling parameters are learned from the training data only, then applied
# to both splits, so no test-set information leaks into the preprocessing.
preproc <- preProcess(
  train_data[, sapply(train_data, is.numeric)],
  method = c("center", "scale")
)
train_data_norm <- predict(preproc, train_data)
test_data_norm <- predict(preproc, test_data)

# Fit kNN with a fixed neighborhood size (k = 5) under the shared
# resampling control object used by the other models.
knn_model <- train(
  NObeyesdad ~ .,
  data = train_data_norm,
  method = "knn",
  trControl = train_control,
  tuneGrid = expand.grid(.k = 5)
)
knn_predictions <- predict(knn_model, newdata = test_data_norm)
metrics <- calculate_metrics(knn_predictions, test_data$NObeyesdad)
accuracies <- c(accuracies, metrics[1])
# Collect every model's name and accuracy into one summary table and show it.
metrics_data <- data.frame(Model = model_names, Accuracy = accuracies)
print(metrics_data)
## Model Accuracy
## 1 Multinomial Logistic Regression 0.9669031
## 2 Naive Bayes 0.7706856
## 3 LDA 0.8983452
## 4 Random Forest 0.9881797
## 5 SVM 0.9007092
## 6 Decision Tree 0.5839243
## 7 Neural Network 0.1725768
## 8 kNN 0.8747045
# Order the summary table from the most to the least accurate model.
sorted_metrics_data <- arrange(metrics_data, desc(Accuracy))
print(sorted_metrics_data)
## Model Accuracy
## 1 Random Forest 0.9881797
## 2 Multinomial Logistic Regression 0.9669031
## 3 SVM 0.9007092
## 4 LDA 0.8983452
## 5 kNN 0.8747045
## 6 Naive Bayes 0.7706856
## 7 Decision Tree 0.5839243
## 8 Neural Network 0.1725768
# Bar chart of model accuracies.
# NOTE: labs() x/y labels refer to the mapped aesthetics, not the flipped
# display axes. Accuracy is mapped to x and Model to y, so the labels must
# match; the original code had them swapped, mislabelling both axes after
# coord_flip().
ggplot(sorted_metrics_data, aes(x = Accuracy, y = Model, fill = Model)) +
  geom_col() +   # geom_col() is the idiomatic form of geom_bar(stat = "identity")
  coord_flip() + # draw bars vertically: models along the bottom axis
  labs(
    title = "Accuracy of Different Models",
    x = "Accuracy",
    y = "Model"
  ) +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3") +
  # after the flip the bottom axis carries model names; angle them for fit
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.